


#
import csv
import json

import requests
from bs4 import BeautifulSoup

# Paginated download of the "Three Hundred Song Ci" (宋词三百首, 300 poems) listing.
# Example listing URLs:
#https://www.shicimingju.com/shicimark/songcisanbaishou.html
#https://www.shicimingju.com/shicimark/songcisanbaishou_2_0__0.html
#https://www.shicimingju.com/shicimark/songcisanbaishou_3_0__0.html
def _strip_layout_whitespace(text):
    """Return *text* with every newline and ASCII space removed."""
    return text.replace("\n", "").replace(" ", "")


def get_request_page():
    """Scrape the paginated poem listing into ``tangPoetry.csv``.

    Requests listing pages 1 through 19 from shicimingju.com, parses each
    page with BeautifulSoup and writes one CSV row per poem with three
    columns:

    1. the text of the ``div.list_num_info`` element,
    2. the text of the first ``<a>`` inside the ``div.shici_list_main`` element,
    3. the text of the ``div.shici_content`` element.

    Newlines and spaces are stripped from every field. Rows are written
    through ``csv.writer`` so that fields containing commas or quotes are
    quoted correctly and each record ends with a real line terminator.

    The output file is truncated on every call. A page that fails to
    download (network error, timeout, HTTP error status) or to parse is
    reported on stdout and skipped; rows already written for that page are
    kept and the remaining pages are still attempted.

    Returns:
        None. The result is the ``tangPoetry.csv`` file in the current
        working directory.
    """
    # Browser-like User-Agent so the request looks like ordinary browser traffic.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    }
    # newline="" is required by the csv module so it controls line endings itself.
    with open(file="tangPoetry.csv", mode="w", encoding="utf8", newline="") as f:
        writer = csv.writer(f)
        for page_no in range(1, 20):
            # NOTE(review): the example URLs listed above this function show the
            # first page without a page suffix; confirm the "_1_0__0" form resolves.
            url = f"https://www.shicimingju.com/shicimark/songcisanbaishou_{page_no}_0__0.html"
            print(f"正在爬取第{page_no}页 ... ")
            try:
                # A timeout keeps one stalled connection from hanging the whole run.
                response = requests.get(url=url, headers=headers, timeout=10)
                # Surface HTTP errors instead of silently parsing an error page.
                response.raise_for_status()
                soup = BeautifulSoup(markup=response.text, features='html.parser')

                rank_divs = soup.find_all(name='div', attrs={'class': 'list_num_info'})
                title_divs = soup.find_all(name='div', attrs={'class': 'shici_list_main'})
                content_divs = soup.find_all(name='div', attrs={'class': 'shici_content'})

                # The three lists are paired positionally; zip stops at the shortest.
                for rank_div, title_div, content_div in zip(rank_divs, title_divs, content_divs):
                    writer.writerow([
                        _strip_layout_whitespace(rank_div.text),
                        _strip_layout_whitespace(title_div.find(name='a').text),
                        _strip_layout_whitespace(content_div.text),
                    ])
            except Exception as ex:
                # Best-effort scrape: report the failure and move on to the next page.
                print(ex)
                print("------------------------------------------------------------------")
                continue

    print("搞定")







# Run the scraper whenever this module is executed or imported.
# get_request_page() has no return statement, so this name is bound to None.
reponse_text = get_request_page()



